# Peek at the first few raw lines of the CSV before loading it with pandas,
# to sanity-check the header and delimiter.
with open('conversion_data.csv') as f:
    for num, line in enumerate(f):
        if num > 5:
            break
        print(num, line)
import pandas as pd
# Load the full dataset and get a first statistical overview of every column.
df = pd.read_csv('conversion_data.csv')
df.describe(include='all')
df.head()
# Eyeball the extreme ages at both tails for data-quality problems.
df[df.age>70]
df[df.age<20]
# NOTE(review): the bokeh.charts API was deprecated and later removed from
# bokeh entirely -- this file only runs against an old bokeh install; verify
# the pinned version before reusing.
from bokeh.charts import Histogram, output_notebook, show
from bokeh.resources import Resources
# Inline resources embed BokehJS in the notebook output (works offline).
resource = Resources(mode='inline')
output_notebook(resources=resource)
p = Histogram(df['age'], bins=30, title="Age Distribution (30 bins)")
show(p)
# Remove "strange" records
# Ages >= 80 look like data-entry errors given the distribution plotted above.
df = df[df.age<80]
df.head()
# `Bar` comes from the (long-deprecated) bokeh.charts API but was not
# imported above -- without this import the script dies with NameError here
# when the cells are run top-to-bottom.
from bokeh.charts import Bar

output_notebook(resources=resource)
# Mean of the 0/1 `converted` flag per country == per-country conversion rate.
p = Bar(df, 'country', values='converted', agg='mean', title="Conversion Rate by Country")
show(p)
import numpy as np
# Per-country conversion rate: mean of the 0/1 `converted` flag by country.
grouped = df.loc[:, ['country', 'converted']].groupby('country')
date_country = grouped.mean()
date_country.index
#grouped.groups
#grouped.sum()
# data_pages = grouped.aggregate(np.mean)
# data_pages
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
# One bar position per distinct country.
ind = np.arange(len(df.country.unique()))
width = 0.5
# xkcd() switches matplotlib into its hand-drawn comic style.
plt.xkcd()
# fig = plt.figure()
fig, ax = plt.subplots()
ax.bar(ind, date_country.converted, width, color="black")
ax.set_title("Conversion Rate by Country")
# NOTE(review): the +width/2 offset centers labels under edge-aligned bars
# (pre-2.0 matplotlib default); modern matplotlib centers bars by default,
# which would put these labels half a bar off -- confirm the mpl version.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(date_country.index)
# NOTE(review): `Bar` (deprecated bokeh.charts API) is not imported until
# near the end of this file -- running top-to-bottom raises NameError here.
output_notebook(resources=resource)
p = Bar(df, 'source', values='converted', agg='mean', color="wheat", title="Conversion Rate by Source")
show(p)
# Conversion rate split by the new_user flag (same NameError caveat on `Bar`
# as the chart above when run in file order).
output_notebook(resources=resource)
p = Bar(df, 'new_user', values='converted', agg='mean', color="green", title="Conversion Rate by New User")
show(p)
# Conversion rate for each individual age value.
grouped = df.loc[:, ['age', 'converted']].groupby('age')
data_age = grouped.mean()
# Copy the index into a regular column so bokeh's Line can use it as x.
data_age["Age"] = data_age.index
# data_age
from bokeh.charts import Line, output_notebook, show
output_notebook(resources=resource)
p = Line(data_age, x='Age', y='converted', color="blue", title="Conversion Rate by Age",
plot_width=900, plot_height=400)
show(p)
# Conversion rate vs. number of pages visited in the session.
grouped = df.loc[:, ['total_pages_visited', 'converted']].groupby('total_pages_visited')
#grouped.groups
#grouped.sum()
data_pages = grouped.aggregate(np.mean)
data_pages
output_notebook(resources=resource)
p = Line(data_pages, title="Conversion Rate vs Total Pages Visited", legend="top_left", ylabel="Conversion Rate")
show(p)
# from sklearn import preprocessing
# le_country = preprocessing.LabelEncoder()
# le_country.fit(df['country'])
# print (list(le_country.classes_))
# df['country_encoded'] = le_country.transform(df['country'])
# le_source = preprocessing.LabelEncoder()
# le_source.fit(df['source'])
# print (list(le_source.classes_))
# df['source_encoded'] = le_source.transform(df['source'])
# pd.factorize is a lighter-weight alternative to the LabelEncoder approach
# commented out above: it returns integer codes plus the unique-label index.
country_encoded, country_index = pd.factorize(df['country'])
df['country_encoded'] = country_encoded
source_encoded, source_index = pd.factorize(df['source'])
df['source_encoded'] = source_encoded
df.head()
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# split 80/20 train-test, fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    df.loc[:, ['country_encoded', 'age', 'new_user', 'source_encoded', 'total_pages_visited']],
    df.converted,
    test_size=0.2,
    random_state=1)
x_train.columns
from sklearn.ensemble import RandomForestClassifier
# `Bar` belongs to the deprecated bokeh.charts API and is only imported
# near the end of this file; import it here so this section is self-contained.
from bokeh.charts import Bar

# Fit a random forest with OOB scoring enabled: a free generalization
# estimate that never touches the held-out test split.
clf = RandomForestClassifier(n_estimators=100, oob_score=True)
clf.fit(x_train, y_train)
clf.oob_score_
# `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `n_features_in_` reports the same value (features seen during fit).
clf.n_features_in_
importance = pd.DataFrame({"feature": pd.Categorical(x_train.columns), "importance": clf.feature_importances_})
output_notebook(resources=resource)
p = Bar(importance, label="feature", values="importance", color="orange", title="Feature importance")
show(p)
preds = clf.predict(x_test)
# Confusion matrix as a labelled crosstab: rows = actual, cols = predicted.
pd.crosstab(y_test, preds, rownames=['actual'], colnames=['preds'])
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc

print("Accuracy:", accuracy_score(y_test, preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
# Feed roc_curve the class-1 probabilities, not the hard 0/1 predictions:
# hard labels collapse the ROC to a single operating point, which makes the
# curve and the AUC computed downstream misleading.
proba = clf.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, proba)
fpr
tpr
thresholds
%matplotlib inline
import matplotlib.pyplot as plt
# ROC curve for the random forest, with the chance diagonal for reference.
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Random Forests')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
# Area under the ROC curve from the (fpr, tpr) points computed above.
auc(fpr, tpr)
# Same ROC curve rendered with bokeh's (still-supported) plotting API.
from bokeh.plotting import figure, show, output_notebook
output_notebook(resources=resource)
p = figure(title="Receiver Operating Characteristic",
y_range=(0.0, 1.05))
# NOTE(review): the `legend` kwarg was later deprecated in favor of
# `legend_label`; fine on the old bokeh this notebook targets.
p.line(fpr, tpr, legend="Random Forests")
show(p)
%matplotlib inline
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split
plt.figure(1, figsize=(12, 8))
RANDOM_STATE = 123
NTREES = 100
# split 80/20 train-test
x_train, x_test, y_train, y_test = train_test_split(df.loc[:, ['country_encoded',
'age',
'new_user',
'source_encoded',
'total_pages_visited']],
df.converted,
test_size=0.2,
random_state=RANDOM_STATE)
ensemble_clfs = [
("RandomForestClassifier, max_features='sqrt'",
RandomForestClassifier(warm_start=True, n_estimators=NTREES, oob_score=True,
max_features="sqrt",
random_state=RANDOM_STATE))
]
# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)
# Range of `n_estimators` values to explore.
min_estimators = 15
max_estimators = 200
for label, clf in ensemble_clfs:
for i in range(min_estimators, max_estimators + 1):
clf.set_params(n_estimators=i)
clf.fit(x_train, y_train)
# Record the OOB error for each `n_estimators=i` setting.
oob_error = 1 - clf.oob_score_
error_rate[label].append((i, oob_error))
# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
xs, ys = zip(*clf_err)
plt.plot(xs, ys, label=label)
plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
# plt.show()
%matplotlib inline
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split
plt.figure(1, figsize=(12, 8))
RANDOM_STATE = 123
NTREES = 100
# split 80/20 train-test
x_train, x_test, y_train, y_test = train_test_split(df.loc[:, ['country_encoded',
'age',
'new_user',
'source_encoded',
'total_pages_visited']],
df.converted,
test_size=0.2,
random_state=RANDOM_STATE)
ensemble_clfs = [
("RandomForestClassifier, max_features='sqrt'",
RandomForestClassifier(warm_start=True, n_estimators=NTREES, oob_score=True,
max_features="sqrt",
random_state=RANDOM_STATE)),
("RandomForestClassifier, max_features='log2'",
RandomForestClassifier(warm_start=True, n_estimators=NTREES, max_features='log2',
oob_score=True,
random_state=RANDOM_STATE)),
("RandomForestClassifier, max_features=None",
RandomForestClassifier(warm_start=True, n_estimators=NTREES, max_features=None,
oob_score=True,
random_state=RANDOM_STATE))
]
# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)
# Range of `n_estimators` values to explore.
min_estimators = 15
max_estimators = 200
for label, clf in ensemble_clfs:
for i in range(min_estimators, max_estimators + 1):
clf.set_params(n_estimators=i)
clf.fit(x_train, y_train)
# Record the OOB error for each `n_estimators=i` setting.
oob_error = 1 - clf.oob_score_
error_rate[label].append((i, oob_error))
# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
xs, ys = zip(*clf_err)
plt.plot(xs, ys, label=label)
plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
# plt.show()
# from sklearn.ensemble import RandomForestClassifier
# features = df.columns[[6,1,2,7]]
# list(features)
# sklearn.cross_validation was removed in 0.20; use model_selection instead.
from sklearn.model_selection import train_test_split

# split 80/20 train-test -- same split as before but WITHOUT
# total_pages_visited, to see how the remaining features rank on their own.
x_train, x_test, y_train, y_test = train_test_split(
    df.loc[:, ['country_encoded', 'age', 'new_user', 'source_encoded']],
    df.converted,
    test_size=0.2,
    random_state=1)
x_train.columns

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
importance = pd.DataFrame({"feature": pd.Categorical(x_train.columns), "importance": clf.feature_importances_})

from bokeh.charts import Bar, output_notebook, show
output_notebook(resources=resource)
p = Bar(importance, label="feature", values="importance", color="gray", title="Feature importance")
show(p)
preds = clf.predict(x_test)
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
print("Accuracy:", accuracy_score(y_test, preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
# cross_val_score moved from sklearn.cross_validation to
# sklearn.model_selection in 0.18 (old module removed in 0.20).
from sklearn.model_selection import cross_val_score
# NOTE(review): cross-validating on only the held-out 20% is unusual;
# the full df would give a less noisy estimate -- confirm intent.
scores = cross_val_score(clf, x_test, y_test)
scores.mean()